library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2)

Education

Is there an association between the percentage of people who graduated high school and the incident rate (1) across neighborhoods in NYC and (2) across neighborhoods within each borough?

# Load the analysis dataset from the working directory
data_clean <- read.csv(file = "data_final.csv")
# Show which columns are available
colnames(data_clean)
##  [1] "INCIDENT_KEY"               "OCCUR_DATE"                
##  [3] "OCCUR_TIME"                 "BORO"                      
##  [5] "LOC_OF_OCCUR_DESC"          "PRECINCT"                  
##  [7] "JURISDICTION_CODE"          "LOC_CLASSFCTN_DESC"        
##  [9] "LOCATION_DESC"              "STATISTICAL_MURDER_FLAG"   
## [11] "PERP_AGE_GROUP"             "PERP_SEX"                  
## [13] "PERP_RACE"                  "VIC_AGE_GROUP"             
## [15] "VIC_SEX"                    "VIC_RACE"                  
## [17] "X_COORD_CD"                 "Y_COORD_CD"                
## [19] "Latitude"                   "Longitude"                 
## [21] "Lon_Lat"                    "Neighborhood"              
## [23] "neighbourhood_group"        "NTA"                       
## [25] "Is_Holiday"                 "Year"                      
## [27] "Month"                      "OCCUR_DATETIME"            
## [29] "Sky_Is_Dark"                "NTAType"                   
## [31] "Total_population_nta"       "CDTA"                      
## [33] "Number_poverty"             "Percent_poverty"           
## [35] "Number_education"           "Percent_education"         
## [37] "incident_rate_by_year_nta"  "total_population_boro"     
## [39] "incident_rate_by_year_boro"
# Inspect each column's type together with a preview of its first values
str(object = data_clean)
## 'data.frame':    9820 obs. of  39 variables:
##  $ INCIDENT_KEY              : int  244608249 247542571 202853370 230311078 229224142 231246224 228559720 238210279 233431365 238238212 ...
##  $ OCCUR_DATE                : chr  "2022-05-05" "2022-07-04" "2019-09-24" "2021-07-01" ...
##  $ OCCUR_TIME                : chr  "00:10:00" "22:20:00" "21:00:00" "23:07:00" ...
##  $ BORO                      : chr  "MANHATTAN" "BRONX" "BRONX" "MANHATTAN" ...
##  $ LOC_OF_OCCUR_DESC         : chr  "INSIDE" "OUTSIDE" NA NA ...
##  $ PRECINCT                  : int  14 48 42 23 113 77 48 49 73 114 ...
##  $ JURISDICTION_CODE         : int  0 0 0 2 0 0 0 0 0 0 ...
##  $ LOC_CLASSFCTN_DESC        : chr  "COMMERCIAL" "STREET" NA NA ...
##  $ LOCATION_DESC             : chr  "VIDEO STORE" "(null)" NA "MULTI DWELL - PUBLIC HOUS" ...
##  $ STATISTICAL_MURDER_FLAG   : logi  TRUE TRUE FALSE FALSE TRUE FALSE ...
##  $ PERP_AGE_GROUP            : chr  "25-44" "(null)" "25-44" NA ...
##  $ PERP_SEX                  : chr  "M" "(null)" "M" NA ...
##  $ PERP_RACE                 : chr  "BLACK" "(null)" "UNKNOWN" NA ...
##  $ VIC_AGE_GROUP             : chr  "25-44" "18-24" "25-44" "25-44" ...
##  $ VIC_SEX                   : chr  "M" "M" "M" "M" ...
##  $ VIC_RACE                  : chr  "BLACK" "BLACK" "BLACK" "BLACK" ...
##  $ X_COORD_CD                : num  986050 1016802 1014493 999061 1042534 ...
##  $ Y_COORD_CD                : num  214231 250581 242565 229912 184647 ...
##  $ Latitude                  : num  40.8 40.9 40.8 40.8 40.7 ...
##  $ Longitude                 : num  -74 -73.9 -73.9 -73.9 -73.8 ...
##  $ Lon_Lat                   : chr  "POINT (-73.9935 40.754692)" "POINT (-73.88233 40.854402)" "POINT (-73.89071440599997 40.832416753000075)" "POINT (-73.94650786199998 40.79772716600007)" ...
##  $ Neighborhood              : chr  "Hell's Kitchen" "Belmont" "East Morrisania" "East Harlem" ...
##  $ neighbourhood_group       : chr  "Manhattan" "Bronx" "Bronx" "Manhattan" ...
##  $ NTA                       : chr  "Chelsea-Hudson Yards" "Belmont" "Crotona Park East" "East Harlem (North)" ...
##  $ Is_Holiday                : logi  FALSE TRUE FALSE FALSE FALSE FALSE ...
##  $ Year                      : int  2022 2022 2019 2021 2021 2021 2021 2021 2021 2021 ...
##  $ Month                     : int  5 7 9 7 6 7 5 12 9 12 ...
##  $ OCCUR_DATETIME            : chr  "2022-05-05 00:10:00" "2022-07-04 22:20:00" "2019-09-24 21:00:00" "2021-07-01 23:07:00" ...
##  $ Sky_Is_Dark               : logi  TRUE TRUE TRUE TRUE FALSE TRUE ...
##  $ NTAType                   : chr  "Residential" "Residential" "Residential" "Residential" ...
##  $ Total_population_nta      : int  69741 35825 30158 64655 43090 15 35825 34623 37952 50225 ...
##  $ CDTA                      : chr  "MN 04" "BX 06" "BX 03" "MN 11" ...
##  $ Number_poverty            : chr  "7,121" "12,919" "8,519" "20,588" ...
##  $ Percent_poverty           : num  11.5 39.6 29.4 32.3 11.4 NA 39.6 24.6 27.3 9.4 ...
##  $ Number_education          : chr  "49,919" "12,455" "13,149" "34,290" ...
##  $ Percent_education         : num  94.6 65.6 69 75.9 83.5 NA 65.6 75.7 83.1 88.4 ...
##  $ incident_rate_by_year_nta : num  0.0272 0.0809 0.0365 0.0619 0.0278 ...
##  $ total_population_boro     : int  18415085 22955825 11660890 20085354 13629328 38087730 29778638 29778638 38087730 13629328 ...
##  $ incident_rate_by_year_boro: num  0.00167 0.00233 0.00229 0.00171 0.00217 ...
# Preview the first six rows to confirm the expected columns are populated
head(data_clean, n = 6L)
##   INCIDENT_KEY OCCUR_DATE OCCUR_TIME      BORO LOC_OF_OCCUR_DESC PRECINCT
## 1    244608249 2022-05-05   00:10:00 MANHATTAN            INSIDE       14
## 2    247542571 2022-07-04   22:20:00     BRONX           OUTSIDE       48
## 3    202853370 2019-09-24   21:00:00     BRONX              <NA>       42
## 4    230311078 2021-07-01   23:07:00 MANHATTAN              <NA>       23
## 5    229224142 2021-06-07   19:55:00    QUEENS              <NA>      113
## 6    231246224 2021-07-22   01:47:00  BROOKLYN              <NA>       77
##   JURISDICTION_CODE LOC_CLASSFCTN_DESC             LOCATION_DESC
## 1                 0         COMMERCIAL               VIDEO STORE
## 2                 0             STREET                    (null)
## 3                 0               <NA>                      <NA>
## 4                 2               <NA> MULTI DWELL - PUBLIC HOUS
## 5                 0               <NA>                      <NA>
## 6                 0               <NA>   MULTI DWELL - APT BUILD
##   STATISTICAL_MURDER_FLAG PERP_AGE_GROUP PERP_SEX PERP_RACE VIC_AGE_GROUP
## 1                    TRUE          25-44        M     BLACK         25-44
## 2                    TRUE         (null)   (null)    (null)         18-24
## 3                   FALSE          25-44        M   UNKNOWN         25-44
## 4                   FALSE           <NA>     <NA>      <NA>         25-44
## 5                    TRUE           <NA>     <NA>      <NA>         45-64
## 6                   FALSE           <NA>     <NA>      <NA>         25-44
##   VIC_SEX VIC_RACE X_COORD_CD Y_COORD_CD Latitude Longitude
## 1       M    BLACK     986050     214231 40.75469 -73.99350
## 2       M    BLACK    1016802     250581 40.85440 -73.88233
## 3       M    BLACK    1014493     242565 40.83242 -73.89071
## 4       M    BLACK     999061     229912 40.79773 -73.94651
## 5       M    BLACK    1042534     184647 40.67331 -73.78989
## 6       M    BLACK    1004507     182865 40.66858 -73.92698
##                                         Lon_Lat    Neighborhood
## 1                    POINT (-73.9935 40.754692)  Hell's Kitchen
## 2                   POINT (-73.88233 40.854402)         Belmont
## 3 POINT (-73.89071440599997 40.832416753000075) East Morrisania
## 4  POINT (-73.94650786199998 40.79772716600007)     East Harlem
## 5 POINT (-73.78988688199998 40.673306465000046)         Jamaica
## 6  POINT (-73.92697993199994 40.66858395700007)   Crown Heights
##   neighbourhood_group                  NTA Is_Holiday Year Month
## 1           Manhattan Chelsea-Hudson Yards      FALSE 2022     5
## 2               Bronx              Belmont       TRUE 2022     7
## 3               Bronx    Crotona Park East      FALSE 2019     9
## 4           Manhattan  East Harlem (North)      FALSE 2021     7
## 5              Queens         Baisley Park      FALSE 2021     6
## 6            Brooklyn Lincoln Terrace Park      FALSE 2021     7
##        OCCUR_DATETIME Sky_Is_Dark     NTAType Total_population_nta  CDTA
## 1 2022-05-05 00:10:00        TRUE Residential                69741 MN 04
## 2 2022-07-04 22:20:00        TRUE Residential                35825 BX 06
## 3 2019-09-24 21:00:00        TRUE Residential                30158 BX 03
## 4 2021-07-01 23:07:00        TRUE Residential                64655 MN 11
## 5 2021-06-07 19:55:00       FALSE Residential                43090 QN 12
## 6 2021-07-22 01:47:00        TRUE        Park                   15 BK 08
##   Number_poverty Percent_poverty Number_education Percent_education
## 1          7,121            11.5           49,919              94.6
## 2         12,919            39.6           12,455              65.6
## 3          8,519            29.4           13,149              69.0
## 4         20,588            32.3           34,290              75.9
## 5          5,054            11.4           25,416              83.5
## 6           <NA>              NA             <NA>                NA
##   incident_rate_by_year_nta total_population_boro incident_rate_by_year_boro
## 1                0.02724366              18415085                0.001667112
## 2                0.08094906              22955825                0.002330563
## 3                0.03647457              11660890                0.002289705
## 4                0.06186683              20085354                0.001707712
## 5                0.02784869              13629328                0.002171787
## 6               13.33333333              38087730                0.001656702

Education across neighborhoods in NYC

# Correlation between the NTA-level incident rate and the percentage of
# people who graduated high school; rows with an NA in either column are
# dropped (use = "complete.obs").
# NOTE(review): data_clean appears to hold one row per incident, so each NTA's
# rate is repeated once per incident in this calculation -- confirm that this
# implicit weighting is intended.
correlation <- cor(
  data_clean$incident_rate_by_year_nta,
  data_clean$Percent_education,
  use = "complete.obs"
)
print(paste("Correlation coefficient: ", correlation))
## [1] "Correlation coefficient:  -0.274782643621429"
# Interactive scatter plot of incident rate against high-school graduation
# percentage for every row of data_clean, with markers coloured by NTA
data_clean |>
  plot_ly(
    x = ~Percent_education,
    y = ~incident_rate_by_year_nta,
    color = ~NTA,
    colors = "viridis",
    type = "scatter",
    mode = "markers",
    # Hover label shown for each marker
    text = ~paste(
      "Neighborhood: ", NTA,
      "<br>Borough: ", BORO,
      "<br>% graduated HS: ", Percent_education,
      "<br>Incident Rate: ", incident_rate_by_year_nta
    )
  ) |>
  layout(
    title = "Percent graduated high school and Incident Rate in NYC",
    xaxis = list(title = "Percentage of People graduated in high school"),
    yaxis = list(title = "Incident Rate"),
    legend = list(title = list(text = "Neighborhood"))
  )
## Warning: Ignoring 95 observations

Education By Borough

# Build one data frame per borough from data_clean, selecting rows by the
# value of neighbourhood_group
manhattan_data <- filter(data_clean, neighbourhood_group == "Manhattan")
brooklyn_data <- filter(data_clean, neighbourhood_group == "Brooklyn")
bronx_data <- filter(data_clean, neighbourhood_group == "Bronx")
staten_island_data <- filter(data_clean, neighbourhood_group == "Staten Island")
queens_data <- filter(data_clean, neighbourhood_group == "Queens")
#' Compute and print the education / incident-rate correlation for a borough
#'
#' Correlates `incident_rate_by_year_nta` with `Percent_education`, dropping
#' rows where either value is missing (`use = "complete.obs"`), and prints the
#' coefficient labelled with `borough_name`.
#'
#' @param data Data frame containing the columns `incident_rate_by_year_nta`
#'   and `Percent_education`.
#' @param borough_name Label used in the printed message.
#' @return The correlation coefficient, invisibly, so callers can store or
#'   compare it without triggering a second print.
compute_correlation <- function(data, borough_name) {
  # `[[` is used instead of `$` so a column name is never partially matched
  correlation <- cor(
    data[["incident_rate_by_year_nta"]],
    data[["Percent_education"]],
    use = "complete.obs"
  )
  cat("Correlation coefficient for", borough_name, ":", correlation, "\n")
  invisible(correlation)
}

# Print the coefficient for each borough in turn
borough_frames <- list(
  "Manhattan" = manhattan_data,
  "Brooklyn" = brooklyn_data,
  "Bronx" = bronx_data,
  "Staten Island" = staten_island_data,
  "Queens" = queens_data
)
for (borough_name in names(borough_frames)) {
  compute_correlation(borough_frames[[borough_name]], borough_name)
}
## Correlation coefficient for Manhattan : -0.2284435
## Correlation coefficient for Brooklyn : -0.248732
## Correlation coefficient for Bronx : -0.3996457
## Correlation coefficient for Staten Island : -0.6452131
## Correlation coefficient for Queens : 0.1006569

Manhattan

compute_correlation(data = manhattan_data, borough_name = "Manhattan")
## Correlation coefficient for Manhattan : -0.2284435
# Interactive scatter plot for Manhattan: incident rate against high-school
# graduation percentage, with markers coloured by NTA
data_clean |>
  filter(neighbourhood_group == "Manhattan") |>
  plot_ly(
    x = ~Percent_education,
    y = ~incident_rate_by_year_nta,
    color = ~NTA,
    colors = "viridis",
    type = "scatter",
    mode = "markers",
    # Hover label shown for each marker
    text = ~paste(
      "Neighborhood: ", NTA,
      "<br>Borough: ", neighbourhood_group,
      "<br>% graduated HS: ", Percent_education,
      "<br>Incident Rate: ", incident_rate_by_year_nta
    )
  ) |>
  layout(
    title = "Percent graduated high school and Incident Rate in Manhattan",
    xaxis = list(title = "Percentage of People graduated in high school"),
    yaxis = list(title = "Incident Rate"),
    legend = list(title = list(text = "Neighborhood"))
  )
## Warning: Ignoring 6 observations

Brooklyn

# Interactive scatter plot for Brooklyn: incident rate against high-school
# graduation percentage, with markers coloured by NTA
data_clean |>
  filter(neighbourhood_group == "Brooklyn") |>
  plot_ly(
    # x is the graduation percentage, matching the title, x-axis label and
    # hover text of this education plot
    x = ~Percent_education,
    y = ~incident_rate_by_year_nta,
    color = ~NTA,
    colors = "plasma",
    type = "scatter",
    mode = "markers",
    # Hover label shown for each marker
    text = ~paste(
      "Neighborhood: ", NTA,
      "<br>Borough: ", neighbourhood_group,
      "<br>% graduated HS: ", Percent_education,
      "<br>Incident Rate: ", incident_rate_by_year_nta
    )
  ) |>
  layout(
    title = "Percent graduated high school and Incident Rate in Brooklyn",
    xaxis = list(title = "Percentage of People graduated in high school"),
    yaxis = list(title = "Incident Rate"),
    legend = list(title = list(text = "Neighborhood"))
  )
## Warning: Ignoring 13 observations

The Bronx

# Interactive scatter plot for The Bronx: incident rate against high-school
# graduation percentage, with markers coloured by NTA
data_clean |>
  filter(neighbourhood_group == "Bronx") |>
  plot_ly(
    # x is the graduation percentage, matching the title, x-axis label and
    # hover text of this education plot
    x = ~Percent_education,
    y = ~incident_rate_by_year_nta,
    color = ~NTA,
    colors = "magma",
    type = "scatter",
    mode = "markers",
    # Hover label shown for each marker
    text = ~paste(
      "Neighborhood: ", NTA,
      "<br>Borough: ", neighbourhood_group,
      "<br>% graduated HS: ", Percent_education,
      "<br>Incident Rate: ", incident_rate_by_year_nta
    )
  ) |>
  layout(
    title = "Percent graduated high school and Incident Rate in The Bronx",
    xaxis = list(title = "Percentage of People graduated in high school"),
    yaxis = list(title = "Incident Rate"),
    legend = list(title = list(text = "Neighborhood"))
  )
## Warning: Ignoring 25 observations

Queens

# Interactive scatter plot for Queens: incident rate against high-school
# graduation percentage, with markers coloured by NTA
data_clean |>
  filter(neighbourhood_group == "Queens") |>
  plot_ly(
    # x is the graduation percentage, matching the title, x-axis label and
    # hover text of this education plot
    x = ~Percent_education,
    y = ~incident_rate_by_year_nta,
    color = ~NTA,
    colors = "inferno",
    type = "scatter",
    mode = "markers",
    # Hover label shown for each marker
    text = ~paste(
      "Neighborhood: ", NTA,
      "<br>Borough: ", neighbourhood_group,
      "<br>% graduated HS: ", Percent_education,
      "<br>Incident Rate: ", incident_rate_by_year_nta
    )
  ) |>
  layout(
    title = "Percent graduated high school and Incident Rate in Queens",
    xaxis = list(title = "Percentage of People graduated in high school"),
    yaxis = list(title = "Incident Rate"),
    legend = list(title = list(text = "Neighborhood"))
  )
## Warning: Ignoring 11 observations

Staten Island

# Interactive scatter plot for Staten Island: incident rate against
# high-school graduation percentage, with markers coloured by NTA
data_clean |>
  filter(neighbourhood_group == "Staten Island") |>
  plot_ly(
    # x is the graduation percentage, matching the title, x-axis label and
    # hover text of this education plot
    x = ~Percent_education,
    y = ~incident_rate_by_year_nta,
    color = ~NTA,
    colors = "inferno",
    type = "scatter",
    mode = "markers",
    # Hover label shown for each marker
    text = ~paste(
      "Neighborhood: ", NTA,
      "<br>Borough: ", neighbourhood_group,
      "<br>% graduated HS: ", Percent_education,
      "<br>Incident Rate: ", incident_rate_by_year_nta
    )
  ) |>
  layout(
    # Title and x-axis label name the borough and variable actually plotted
    title = "Percent graduated high school and Incident Rate in Staten Island",
    xaxis = list(title = "Percentage of People graduated in high school"),
    yaxis = list(title = "Incident Rate"),
    legend = list(title = list(text = "Neighborhood"))
  )
## Warning: Ignoring 5 observations